In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from db_utils import query_analytics_store,query_hive_ssh
import dateutil
In [2]:
response_table = 'log.QuickSurveysResponses_15266417'
impression_table = 'log.QuickSurveyInitiation_15278946'
host = 'analytics-store.eqiad.wmnet'
In [3]:
query = """
SELECT
COUNT(*) as n
FROM log.QuickSurveyInitiation_15278946
WHERE
event_eventName ='eligible'
"""
n_eligible = query_analytics_store(query, {}).iloc[0]['n']
query = """
SELECT
COUNT(*) as n
FROM log.QuickSurveyInitiation_15278946
WHERE
event_eventName ='impression'
"""
n_impressions = query_analytics_store(query, {}).iloc[0]['n']
query = """
SELECT
COUNT(*) as n
FROM log.QuickSurveysResponses_15266417
"""
n_responses = query_analytics_store(query, {}).iloc[0]['n']
query = """
SELECT
COUNT(*) as n
FROM log.QuickSurveysResponses_15266417
WHERE
event_surveyResponseValue ='ext-quicksurveys-external-survey-yes-button'
"""
n_yes = query_analytics_store( query, {}).iloc[0]['n']
query = """
SELECT
COUNT(*) as n
FROM log.QuickSurveysResponses_15266417
WHERE
event_surveyResponseValue ='ext-quicksurveys-external-survey-no-button'
"""
n_no = query_analytics_store( query, {}).iloc[0]['n']
print('Eligible Pageviews: ', n_eligible)
print('Widget Impressions: ', n_impressions)
print('Clicks: ', n_responses)
print('Yeses: ', n_yes)
print('Nos: ', n_no)
In [25]:
query = """
SELECT COUNT(DISTINCT(event_surveyInstanceToken)) as n
FROM log.QuickSurveyInitiation_15278946
WHERE event_eventName ='eligible'
"""
print(query_analytics_store(query, {}).iloc[0]['n'])
Pretty Close
In [26]:
query = """
SELECT COUNT(DISTINCT(event_surveyInstanceToken)) as n
FROM log.QuickSurveyInitiation_15278946
WHERE event_eventName ='impression'
"""
print(query_analytics_store(query, {}).iloc[0]['n'])
Pretty Close.
In [27]:
query = """
SELECT
COUNT(DISTINCT(i.event_surveyInstanceToken)) as n
FROM
(SELECT event_surveyInstanceToken
FROM log.QuickSurveyInitiation_15278946
WHERE
event_eventName ='impression') i
JOIN
(SELECT event_surveyInstanceToken
FROM log.QuickSurveyInitiation_15278946
WHERE
event_eventName ='eligible') e
ON
(i.event_surveyInstanceToken = e.event_surveyInstanceToken)
"""
df = query_analytics_store(query, {})
print(df.iloc[0]['n'])
Close. It seems there are a small number of impressions without a matching pageview.
In [28]:
query = """
SELECT *
FROM
log.QuickSurveysResponses_15266417 r,
log.QuickSurveyInitiation_15278946 i
WHERE
r.event_surveyInstanceToken = i.event_surveyInstanceToken
AND i.event_eventName ='impression'
"""
d_click = query_analytics_store( query, {})
print(d_click.shape[0], len(set(d_click['event_surveyInstanceToken'])))
Each click has exactly one widget impression. One click without a matching rendered widget
In [43]:
d_survey = pd.read_csv('responses.tsv', sep = '\t')
st = 'This is you survey ID. Please do not modify.'
ct = 'event_surveyInstanceToken'
df_survey_click = d_survey.merge(d_click, how = 'inner', right_on = ct, left_on = st)
In [44]:
print(df_survey_click.shape[0])
In [45]:
print(d_survey.shape[0])
In [46]:
d_survey['Timestamp'] = d_survey['Timestamp'].apply(lambda x: dateutil.parser.parse(x))
In [54]:
df_survey_click['Timestamp'] = df_survey_click['Timestamp'].apply(lambda x: dateutil.parser.parse(x))
In [55]:
d_survey[d_survey['Timestamp'] > '2016-02-25 16:20'][d_survey['Timestamp'] < '2016-02-25 17:10'].shape #12:20 1:08
Out[55]:
In [57]:
df_survey_click[df_survey_click['Timestamp'] > '2016-02-25 16:20'][df_survey_click['Timestamp'] < '2016-02-25 17:10'].shape #12:20 1:08
Out[57]:
In [18]:
query = """
SELECT
event_surveyInstanceToken
FROM log.QuickSurveyInitiation_15278946
WHERE
event_eventName ='impression'
"""
d_impression = query_analytics_store(query, {})
df_survey_impression = d_survey.merge(d_impression, how = 'inner', right_on = 'event_surveyInstanceToken', left_on = st)
In [19]:
print(df_survey_impression.shape[0])
Only 61 out of 81 survey responses have a matching click.
In [20]:
query = """
SELECT
event_surveyInstanceToken
FROM log.QuickSurveyInitiation_15278946
WHERE
event_eventName ='eligible'
"""
d_pageview = query_analytics_store(query, {})
df_survey_pageview = d_survey.merge(d_pageview, how = 'inner', right_on = 'event_surveyInstanceToken', left_on = st)
In [21]:
print(df_survey_pageview.shape[0])
In [21]:
print('Eligible Pageviews: ', n_eligible)
print('Widget Impressions: ', n_impressions)
print('Clicks: ', n_responses)
print('Yeses: ', n_yes)
print('Google Responses tracked in EL', df_survey_click.shape[0])
print('Google Responses', d_survey.shape[0])
In [22]:
df_survey_click2 = d_survey.merge(d_click, how = 'left', right_on = ct, left_on = st)
In [23]:
df_survey_click2 = df_survey_click2[[st,'Timestamp', ct, 'timestamp']].sort('Timestamp')
df_survey_click2.columns = ['Google Token', 'Google Timestamp', 'EL Token', 'EL Timestamp']
In [24]:
df_survey_click2['Google Token'].apply(len).value_counts()
Out[24]:
No obvious pattern...
In [25]:
query = """
SELECT
COUNT(DISTINCT(clientIP)) as n
FROM log.QuickSurveyInitiation_15278946
"""
nIPs = query_analytics_store( query, {}).iloc[0]['n']
print(nIPs)
In [26]:
query = """
SELECT
COUNT(DISTINCT(CONCAT(clientIP, userAgent))) as n
FROM log.QuickSurveyInitiation_15278946
"""
nIPUAs = query_analytics_store( query, {}).iloc[0]['n']
print(nIPUAs)
In [27]:
query = """
SELECT
COUNT(DISTINCT(event_surveySessionToken)) as n
FROM log.QuickSurveyInitiation_15278946
"""
sessiontokens = query_analytics_store( query, {}).iloc[0]['n']
print(sessiontokens)
In [109]:
query = """
SELECT COUNT(*) as n, userAgent
FROM log.QuickSurveyInitiation_15278946
WHERE event_eventName ='eligible'
GROUP BY userAgent
"""
el_ua_df = query_analytics_store( query, {})
el_ua_df['proportion'] = el_ua_df['n'] / el_ua_df['n'].sum()
el_ua_df['userAgent'] = el_ua_df['userAgent'].apply(lambda x: x[1:-1])
In [110]:
query = """
SELECT SUM(n) as n FROM ellery.ua_counts
"""
dn = query_hive_ssh( query, 'temp.txt')
query = """
SELECT n, user_agent
FROM ellery.ua_counts
ORDER BY n DESC
LIMIT 1000
"""
webrequest_ua_df = query_hive_ssh( query, 'temp.txt')
webrequest_ua_df['proportion'] = webrequest_ua_df['n'] / dn.iloc[0]['n']
In [111]:
ua_merge = webrequest_ua_df.merge(el_ua_df, how ='left', left_on = 'user_agent', right_on = 'userAgent' )
ua_merge.fillna(0, inplace = True)
ua_merge['delta'] = ua_merge['proportion_x'] - ua_merge['proportion_y']
ua_merge.sort('delta', inplace = True, ascending = False)
In [116]:
for i, r in ua_merge[['user_agent', 'delta']].head(25).iterrows():
print(r['user_agent'], r['delta'])
In [7]:
query = """
SELECT
webHost,
page_title as title,
event_surveyInstanceToken as survey_token,
clientIp,
userAgent,
CAST(DATE_FORMAT(STR_TO_DATE(timestamp, '%%Y%%m%%d%%H%%i%%S'), '%%Y-%%m-%%d %%H:%%i:%%S') AS CHAR(22) CHARSET utf8) AS timestamp
FROM
log.QuickSurveysResponses_15266417 s,
enwiki.page p
WHERE
p.page_id = s.event_pageId
AND s.event_surveyResponseValue ='ext-quicksurveys-external-survey-yes-button'
"""
df = query_analytics_store( query, {})
In [9]:
(df['clientIp'] + df['userAgent']).value_counts().value_counts()
Out[9]:
In [ ]: